Author : Priyanuj Saikia
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
import seaborn as sns
sns.set(style="whitegrid")
import matplotlib.pyplot as plt
from matplotlib.pyplot import xticks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the housing data set and preview the first rows.
HOUSING_CSV = './HousingNew.csv'
df = pd.read_csv(HOUSING_CSV)
df.head()
| ID | price | lotsize | bedrooms | bathrms | stories | garagepl | recroom | fullbase | airco | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 42000.0 | 5850 | 3 | 1 | 2 | 1 | no | yes | no |
| 1 | 2 | 38500.0 | 4000 | 2 | 1 | 1 | 0 | no | no | no |
| 2 | 3 | 49500.0 | 3060 | 3 | 1 | 1 | 0 | no | no | no |
| 3 | 4 | 60500.0 | 6650 | 3 | 1 | 2 | 0 | yes | no | no |
| 4 | 5 | 61000.0 | 6360 | 2 | 1 | 1 | 0 | no | no | no |
# The ID column is a row identifier with no predictive value — remove it.
df.drop(columns=['ID'], inplace=True)
display(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 546 entries, 0 to 545 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 546 non-null float64 1 lotsize 546 non-null int64 2 bedrooms 546 non-null int64 3 bathrms 546 non-null int64 4 stories 546 non-null int64 5 garagepl 546 non-null int64 6 recroom 546 non-null object 7 fullbase 546 non-null object 8 airco 546 non-null object dtypes: float64(1), int64(5), object(3) memory usage: 38.5+ KB
None
Analysis : No null values treatment is required
# Per-column metadata: cardinality and dtype, used later to pick
# out the categorical columns.
meta_data = pd.concat([df.nunique(), df.dtypes], axis=1)
meta_data.columns = ['No. of Unique', 'Data Types']
display(meta_data)
| No. of Unique | Data Types | |
|---|---|---|
| price | 219 | float64 |
| lotsize | 284 | int64 |
| bedrooms | 6 | int64 |
| bathrms | 4 | int64 |
| stories | 4 | int64 |
| garagepl | 4 | int64 |
| recroom | 2 | object |
| fullbase | 2 | object |
| airco | 2 | object |
Analysis : 7 independent variables are categorical, 1 independent variable is continuous, and the dependent variable is continuous
Dependent Variable
- Visualization of Price
Independent Variable
- Visualising Numeric Variables
# Interactive box plot of the target variable 'price'.
temp_df = pd.DataFrame(df["price"])
data = [
    go.Box(y=temp_df.iloc[:, i], name=col_name)
    for i, col_name in enumerate(temp_df)
]
layout = go.Layout(yaxis=dict(title="Frequency", zeroline=False), boxmode='group')
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)
Analysis : House prices have a median of 62k; the min and max are 25k and 190k respectively. Prices above 130k appear to be outliers, and the price distribution is positively skewed
Visualising Numeric variable
display(df.describe())
| price | lotsize | bedrooms | bathrms | stories | garagepl | |
|---|---|---|---|---|---|---|
| count | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 |
| mean | 68121.597070 | 5150.265568 | 2.965201 | 1.285714 | 1.807692 | 0.692308 |
| std | 26702.670926 | 2168.158725 | 0.737388 | 0.502158 | 0.868203 | 0.861307 |
| min | 25000.000000 | 1650.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 49125.000000 | 3600.000000 | 2.000000 | 1.000000 | 1.000000 | 0.000000 |
| 50% | 62000.000000 | 4600.000000 | 3.000000 | 1.000000 | 2.000000 | 0.000000 |
| 75% | 82000.000000 | 6360.000000 | 3.000000 | 2.000000 | 2.000000 | 1.000000 |
| max | 190000.000000 | 16200.000000 | 6.000000 | 4.000000 | 4.000000 | 3.000000 |
Visualising continous variable
# Scatter of the only continuous predictor (lotsize) against price,
# followed by their Pearson correlation matrix.
sns.scatterplot(x="lotsize", y="price", data=df)
plt.show()
display(df[['lotsize', 'price']].corr())
| lotsize | price | |
|---|---|---|
| lotsize | 1.000000 | 0.535796 |
| price | 0.535796 | 1.000000 |
Analysis : Lotsize and Price are positively correlated with correlation coefficient 0.54
Categorical Variable Visualisation
# Frequency of each bedroom count.
plt.figure(figsize=(12, 6))
plt.title("bedrooms")
sns.countplot(x="bedrooms", data=df, palette="plasma")
plt.xticks(rotation=90)
plt.show()
Analysis : Note that 3-bedroom houses are by far the most common
# Box plots of price against every categorical predictor in one 3x3 grid.
# (Replaces seven copy-pasted subplot stanzas with a single loop.)
categorical_cols = ['bedrooms', 'bathrms', 'stories', 'garagepl',
                    'recroom', 'fullbase', 'airco']
plt.figure(figsize=(20, 12))
for position, column in enumerate(categorical_cols, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=column, y='price', data=df, palette="plasma")
plt.show()
Analysis :
Air-conditioned houses have a higher average price.
Houses with a recreation room have a higher average price, but the most expensive houses do not have a recreation room
Average price increases with the number of bedrooms, bathrooms, stories, and garage places.
Having a full basement does not make much difference to the price
# Average price by bedroom count, broken down by number of bathrooms.
plt.figure(figsize=(14, 8))
sns.barplot(data=df, x='bedrooms', y='price', hue='bathrms', palette="plasma")
plt.show()
Analysis : 4 bathrooms are found only in 4 bedroom house
# Average price for recroom and fullbase, each split by air conditioning.
# (Replaces two copy-pasted subplot stanzas with a loop.)
plt.figure(figsize=(20, 6))
for position, column in enumerate(['recroom', 'fullbase'], start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=column, y='price', hue='airco', data=df, palette="plasma")
plt.show()
Analysis : Air conditioning with a recreation room is more expensive than air conditioning with a full basement
- For regression analysis we need numerical values instead of string, so encoding is necessary.
Visualising Categorical Variables
# Columns with fewer than 8 distinct values are treated as categorical;
# list each one's unique values for inspection before encoding.
categorical_df = meta_data[meta_data['No. of Unique'] < 8]
categorical_df.index
rows = [[name, df[name].unique()] for name in categorical_df.index.tolist()]
unique_val_df = pd.DataFrame(rows, columns=['Columns', 'Unique Values'])
display(unique_val_df)
| Columns | Unique Values | |
|---|---|---|
| 0 | bedrooms | [3, 2, 4, 1, 5, 6] |
| 1 | bathrms | [1, 2, 3, 4] |
| 2 | stories | [2, 1, 3, 4] |
| 3 | garagepl | [1, 0, 2, 3] |
| 4 | recroom | [no, yes] |
| 5 | fullbase | [yes, no] |
| 6 | airco | [no, yes] |
Analysis : Encoding 'yes' to 1 and 'no' to 0.
# Encode the binary yes/no flags as 1/0 integers for regression.
# np.where(cond, 1, 0) already yields an integer array, so the original
# follow-up .astype(int) was a no-op and has been removed; the three
# identical statements are folded into one loop.
for flag_col in ['recroom', 'fullbase', 'airco']:
    df[flag_col] = np.where(df[flag_col] == 'yes', 1, 0)
display(df.head())
| price | lotsize | bedrooms | bathrms | stories | garagepl | recroom | fullbase | airco | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 42000.0 | 5850 | 3 | 1 | 2 | 1 | 0 | 1 | 0 |
| 1 | 38500.0 | 4000 | 2 | 1 | 1 | 0 | 0 | 0 | 0 |
| 2 | 49500.0 | 3060 | 3 | 1 | 1 | 0 | 0 | 0 | 0 |
| 3 | 60500.0 | 6650 | 3 | 1 | 2 | 0 | 1 | 0 | 0 |
| 4 | 61000.0 | 6360 | 2 | 1 | 1 | 0 | 0 | 0 | 0 |
- 'bedrooms', 'bathrms', 'stories', 'garagepl' can be treated as dummies
# Suffix each level with its column name (e.g. 3 -> '3_bedrooms') so the
# dummy columns produced by get_dummies get self-describing names.
# (Folds four identical statements into one loop.)
for cat_col in ['bedrooms', 'bathrms', 'stories', 'garagepl']:
    df[cat_col] = df[cat_col].astype(str) + '_' + cat_col
display(df.head())
| price | lotsize | bedrooms | bathrms | stories | garagepl | recroom | fullbase | airco | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 42000.0 | 5850 | 3_bedrooms | 1_bathrms | 2_stories | 1_garagepl | 0 | 1 | 0 |
| 1 | 38500.0 | 4000 | 2_bedrooms | 1_bathrms | 1_stories | 0_garagepl | 0 | 0 | 0 |
| 2 | 49500.0 | 3060 | 3_bedrooms | 1_bathrms | 1_stories | 0_garagepl | 0 | 0 | 0 |
| 3 | 60500.0 | 6650 | 3_bedrooms | 1_bathrms | 2_stories | 0_garagepl | 1 | 0 | 0 |
| 4 | 61000.0 | 6360 | 2_bedrooms | 1_bathrms | 1_stories | 0_garagepl | 0 | 0 | 0 |
def create_dummy_variable(dataframe, column_name):
    """Return a copy of `dataframe` with `column_name` replaced by its
    one-hot (dummy) indicator columns.

    The input frame is not modified; the dummy columns are appended on
    the right and the original categorical column is dropped.
    """
    dummies = pd.get_dummies(dataframe[column_name])
    widened = pd.concat([dataframe, dummies], axis=1)
    return widened.drop(columns=[column_name])
# Replace each multi-level categorical column with its dummy columns.
# (Folds four identical calls into one loop.)
for cat_col in ["bedrooms", "bathrms", "stories", "garagepl"]:
    df = create_dummy_variable(df, cat_col)
display(df.head())
display(df.describe())
| price | lotsize | recroom | fullbase | airco | 1_bedrooms | 2_bedrooms | 3_bedrooms | 4_bedrooms | 5_bedrooms | ... | 3_bathrms | 4_bathrms | 1_stories | 2_stories | 3_stories | 4_stories | 0_garagepl | 1_garagepl | 2_garagepl | 3_garagepl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42000.0 | 5850 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 38500.0 | 4000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 49500.0 | 3060 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 60500.0 | 6650 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 61000.0 | 6360 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 23 columns
| price | lotsize | recroom | fullbase | airco | 1_bedrooms | 2_bedrooms | 3_bedrooms | 4_bedrooms | 5_bedrooms | ... | 3_bathrms | 4_bathrms | 1_stories | 2_stories | 3_stories | 4_stories | 0_garagepl | 1_garagepl | 2_garagepl | 3_garagepl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | ... | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 |
| mean | 68121.597070 | 5150.265568 | 0.177656 | 0.349817 | 0.316850 | 0.003663 | 0.249084 | 0.551282 | 0.173993 | 0.018315 | ... | 0.018315 | 0.001832 | 0.415751 | 0.435897 | 0.073260 | 0.075092 | 0.549451 | 0.230769 | 0.197802 | 0.021978 |
| std | 26702.670926 | 2168.158725 | 0.382573 | 0.477349 | 0.465675 | 0.060467 | 0.432879 | 0.497819 | 0.379451 | 0.134211 | ... | 0.134211 | 0.042796 | 0.493303 | 0.496329 | 0.260802 | 0.263781 | 0.498005 | 0.421711 | 0.398707 | 0.146746 |
| min | 25000.000000 | 1650.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 49125.000000 | 3600.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 62000.000000 | 4600.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 82000.000000 | 6360.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 190000.000000 | 16200.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 23 columns
Analysis : Dummy variables successfully created.
# Fix the global numpy seed for reproducibility of anything seeded later.
np.random.seed(0)
# 70/30 split; the explicit test_size=0.3 was redundant (sklearn uses the
# complement of train_size by default). random_state pins the shuffle
# independently of the global seed above.
df_train, df_test = train_test_split(df, train_size=0.7, random_state=100)
display(df_train.shape)
display(df_test.shape)
(382, 23)
(164, 23)
Remark : 70% train and 30% test data successfully created.
It is important to rescale the variables to a comparable scale; otherwise the regression coefficients will be on inconsistent scales and hard to interpret. Two ways of rescaling:
- Min-Max scaling
- Standardisation (mean=0, sigma=1)
Here we shall use Min-Max scaling.
# Min-Max scale the continuous columns to [0, 1], fitting on the
# training data only. The original called fit_transform twice and
# discarded the first result (unused `rescale_values`); one call suffices.
scale = MinMaxScaler()
rescale_cols = ['price', 'lotsize']
df_train[rescale_cols] = scale.fit_transform(df_train[rescale_cols])
display(df_train.head())
| price | lotsize | recroom | fullbase | airco | 1_bedrooms | 2_bedrooms | 3_bedrooms | 4_bedrooms | 5_bedrooms | ... | 3_bathrms | 4_bathrms | 1_stories | 2_stories | 3_stories | 4_stories | 0_garagepl | 1_garagepl | 2_garagepl | 3_garagepl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 46 | 0.096970 | 0.120346 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 50 | 0.181818 | 0.316017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 483 | 0.160606 | 0.203463 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 19 | 0.121212 | 0.202251 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 159 | 0.230303 | 0.200693 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 23 columns
We will be using a mixed approach to find the relevant features.
- Identify features using Recursive Feature Elimination (RFE). RFE is a feature selection method that fits a model and removes the weakest feature (or features) until the specified number of features is reached.
- Manually find the correct fit
# Separate the target ('price') from the predictors.
y_train = df_train.pop('price').copy()
x_train = df_train.copy()
# Baseline linear model, then RFE keeps the 17 strongest of the 22
# predictors by recursively dropping the weakest feature.
regression = LinearRegression()
regression.fit(x_train, y_train)
rfe = RFE(regression, n_features_to_select=17).fit(x_train, y_train)
display(list(zip(x_train.columns, rfe.support_, rfe.ranking_)))
col = x_train.columns[rfe.support_]
print("RFE suggested variables: ", col)
[('lotsize', True, 1),
('recroom', True, 1),
('fullbase', True, 1),
('airco', True, 1),
('1_bedrooms', True, 1),
('2_bedrooms', True, 1),
('3_bedrooms', False, 6),
('4_bedrooms', False, 4),
('5_bedrooms', False, 3),
('6_bedrooms', True, 1),
('1_bathrms', True, 1),
('2_bathrms', True, 1),
('3_bathrms', False, 5),
('4_bathrms', True, 1),
('1_stories', True, 1),
('2_stories', True, 1),
('3_stories', True, 1),
('4_stories', True, 1),
('0_garagepl', True, 1),
('1_garagepl', False, 2),
('2_garagepl', True, 1),
('3_garagepl', True, 1)]
RFE suggested variables: Index(['lotsize', 'recroom', 'fullbase', 'airco', '1_bedrooms', '2_bedrooms',
'6_bedrooms', '1_bathrms', '2_bathrms', '4_bathrms', '1_stories',
'2_stories', '3_stories', '4_stories', '0_garagepl', '2_garagepl',
'3_garagepl'],
dtype='object')
# Running counter so each fitted model gets a sequential label.
model_count = 0

def statsmodel_summary(y_var, x_var):
    """Fit an OLS regression of y_var on x_var (with intercept),
    print the numbered model summary and its VIF table, and return
    the constant-augmented design matrix plus the fitted model."""
    global model_count
    model_count += 1
    print(f"MODEL - {model_count}")
    x_var_const = sm.add_constant(x_var)           # add intercept column
    lm = sm.OLS(y_var, x_var_const).fit()
    print(lm.summary())
    display_vif(x_var_const.drop(['const'], axis=1))
    return x_var_const, lm
def display_vif(x):
    """Compute and display the Variance Inflation Factor for every
    column of `x`, sorted descending and color-coded by severity.

    Fixes from review: removed the pointless `X = x` alias and renamed
    the local that shadowed the global `df`.
    """
    vif = pd.DataFrame()
    vif['Features'] = x.columns
    vif['VIF'] = [variance_inflation_factor(x.values, i)
                  for i in range(x.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.set_index("Features").sort_values(by="VIF", ascending=False)
    # NOTE(review): Styler.applymap is deprecated in pandas >= 2.1 in
    # favor of Styler.map — kept for compatibility with the pandas
    # version this notebook was written against.
    styled = pd.DataFrame(vif.VIF).style.applymap(color_code_vif_values)
    display(styled)
def color_code_vif_values(val):
    """Return a CSS style string color-coding a VIF value.

    > 10        -> red       (severe multicollinearity)
    (5, 10]     -> blue      (moderate)
    (0, 5]      -> darkgreen (acceptable)
    otherwise   -> black     (e.g. 0 or negative)
    """
    if val > 10:
        color = 'red'
    elif 5 < val <= 10:          # chained comparison replaces `and`
        color = 'blue'
    elif 0 < val <= 5:
        color = 'darkgreen'
    else:
        color = 'black'
    return 'color: %s' % color
# MODEL 1: fit OLS on the 17 features suggested by RFE.
x_new = x_train[col]
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 1
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.666
Model: OLS Adj. R-squared: 0.651
Method: Least Squares F-statistic: 45.40
Date: Thu, 21 Jan 2021 Prob (F-statistic): 1.41e-76
Time: 14:41:58 Log-Likelihood: 358.09
No. Observations: 382 AIC: -682.2
Df Residuals: 365 BIC: -615.1
Df Model: 16
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.2900 0.035 8.359 0.000 0.222 0.358
lotsize 0.2551 0.031 8.297 0.000 0.195 0.316
recroom 0.0429 0.014 2.991 0.003 0.015 0.071
fullbase 0.0431 0.012 3.583 0.000 0.019 0.067
airco 0.0707 0.012 5.937 0.000 0.047 0.094
1_bedrooms -0.0654 0.098 -0.668 0.504 -0.258 0.127
2_bedrooms -0.0256 0.014 -1.860 0.064 -0.053 0.001
6_bedrooms 0.0475 0.069 0.684 0.494 -0.089 0.184
1_bathrms -0.1844 0.041 -4.533 0.000 -0.264 -0.104
2_bathrms -0.1010 0.041 -2.444 0.015 -0.182 -0.020
4_bathrms 0.3148 0.113 2.787 0.006 0.093 0.537
1_stories 0.0049 0.014 0.349 0.727 -0.023 0.032
2_stories 0.0431 0.012 3.595 0.000 0.020 0.067
3_stories 0.1162 0.019 6.120 0.000 0.079 0.153
4_stories 0.1259 0.018 7.077 0.000 0.091 0.161
0_garagepl -0.0508 0.013 -4.021 0.000 -0.076 -0.026
2_garagepl 0.0321 0.015 2.090 0.037 0.002 0.062
3_garagepl -0.0538 0.042 -1.292 0.197 -0.136 0.028
==============================================================================
Omnibus: 72.975 Durbin-Watson: 2.056
Prob(Omnibus): 0.000 Jarque-Bera (JB): 191.789
Skew: 0.914 Prob(JB): 2.26e-42
Kurtosis: 5.951 Cond. No. 2.41e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.84e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
| VIF | |
|---|---|
| Features | |
| 1_stories | 35.210000 |
| 2_stories | 30.950000 |
| 1_bathrms | 13.080000 |
| 2_bathrms | 12.880000 |
| 4_stories | 7.250000 |
| 3_stories | 6.080000 |
| 0_garagepl | 1.620000 |
| 2_garagepl | 1.570000 |
| 2_bedrooms | 1.530000 |
| 4_bathrms | 1.350000 |
| fullbase | 1.340000 |
| 3_garagepl | 1.270000 |
| lotsize | 1.260000 |
| recroom | 1.240000 |
| airco | 1.240000 |
| 6_bedrooms | 1.020000 |
| 1_bedrooms | 1.020000 |
Analysis - Delete the feature with the highest p-value, i.e. '1_stories', for a stable model
Model Attribute - Adj. R-squared:0.651
# MODEL 2: drop '1_stories' (highest p-value, VIF 35.2) and refit.
x_new.pop('1_stories')
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 2
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.666
Model: OLS Adj. R-squared: 0.651
Method: Least Squares F-statistic: 45.40
Date: Thu, 21 Jan 2021 Prob (F-statistic): 1.41e-76
Time: 14:41:59 Log-Likelihood: 358.09
No. Observations: 382 AIC: -682.2
Df Residuals: 365 BIC: -615.1
Df Model: 16
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.2949 0.045 6.604 0.000 0.207 0.383
lotsize 0.2551 0.031 8.297 0.000 0.195 0.316
recroom 0.0429 0.014 2.991 0.003 0.015 0.071
fullbase 0.0431 0.012 3.583 0.000 0.019 0.067
airco 0.0707 0.012 5.937 0.000 0.047 0.094
1_bedrooms -0.0654 0.098 -0.668 0.504 -0.258 0.127
2_bedrooms -0.0256 0.014 -1.860 0.064 -0.053 0.001
6_bedrooms 0.0475 0.069 0.684 0.494 -0.089 0.184
1_bathrms -0.1844 0.041 -4.533 0.000 -0.264 -0.104
2_bathrms -0.1010 0.041 -2.444 0.015 -0.182 -0.020
4_bathrms 0.3148 0.113 2.787 0.006 0.093 0.537
2_stories 0.0383 0.013 3.024 0.003 0.013 0.063
3_stories 0.1113 0.023 4.808 0.000 0.066 0.157
4_stories 0.1210 0.022 5.435 0.000 0.077 0.165
0_garagepl -0.0508 0.013 -4.021 0.000 -0.076 -0.026
2_garagepl 0.0321 0.015 2.090 0.037 0.002 0.062
3_garagepl -0.0538 0.042 -1.292 0.197 -0.136 0.028
==============================================================================
Omnibus: 72.975 Durbin-Watson: 2.056
Prob(Omnibus): 0.000 Jarque-Bera (JB): 191.789
Skew: 0.914 Prob(JB): 2.26e-42
Kurtosis: 5.951 Cond. No. 38.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| VIF | |
|---|---|
| Features | |
| 1_bathrms | 8.960000 |
| lotsize | 4.580000 |
| 2_bathrms | 4.140000 |
| 0_garagepl | 3.260000 |
| 2_stories | 2.490000 |
| 2_bedrooms | 2.040000 |
| fullbase | 1.990000 |
| 2_garagepl | 1.890000 |
| airco | 1.810000 |
| 4_stories | 1.630000 |
| recroom | 1.510000 |
| 3_stories | 1.410000 |
| 3_garagepl | 1.290000 |
| 4_bathrms | 1.240000 |
| 6_bedrooms | 1.030000 |
| 1_bedrooms | 1.020000 |
Analysis - Delete the feature with the highest p-value, i.e. '1_bedrooms', for a stable model
Model Attribute - Adj. R-squared: 0.651
# MODEL 3: drop '1_bedrooms' (highest p-value, 0.504) and refit.
x_new.pop('1_bedrooms')
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 3
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.665
Model: OLS Adj. R-squared: 0.651
Method: Least Squares F-statistic: 48.47
Date: Thu, 21 Jan 2021 Prob (F-statistic): 2.48e-77
Time: 14:41:59 Log-Likelihood: 357.86
No. Observations: 382 AIC: -683.7
Df Residuals: 366 BIC: -620.6
Df Model: 15
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.2938 0.045 6.590 0.000 0.206 0.382
lotsize 0.2554 0.031 8.316 0.000 0.195 0.316
recroom 0.0429 0.014 2.996 0.003 0.015 0.071
fullbase 0.0435 0.012 3.619 0.000 0.020 0.067
airco 0.0709 0.012 5.960 0.000 0.048 0.094
2_bedrooms -0.0247 0.014 -1.808 0.071 -0.052 0.002
6_bedrooms 0.0478 0.069 0.689 0.491 -0.089 0.184
1_bathrms -0.1845 0.041 -4.538 0.000 -0.264 -0.105
2_bathrms -0.1009 0.041 -2.444 0.015 -0.182 -0.020
4_bathrms 0.3145 0.113 2.786 0.006 0.093 0.537
2_stories 0.0390 0.013 3.099 0.002 0.014 0.064
3_stories 0.1122 0.023 4.859 0.000 0.067 0.158
4_stories 0.1217 0.022 5.475 0.000 0.078 0.165
0_garagepl -0.0511 0.013 -4.044 0.000 -0.076 -0.026
2_garagepl 0.0323 0.015 2.101 0.036 0.002 0.062
3_garagepl -0.0535 0.042 -1.287 0.199 -0.135 0.028
==============================================================================
Omnibus: 72.452 Durbin-Watson: 2.060
Prob(Omnibus): 0.000 Jarque-Bera (JB): 188.914
Skew: 0.910 Prob(JB): 9.50e-42
Kurtosis: 5.925 Cond. No. 38.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| VIF | |
|---|---|
| Features | |
| 1_bathrms | 8.910000 |
| lotsize | 4.580000 |
| 2_bathrms | 4.130000 |
| 0_garagepl | 3.250000 |
| 2_stories | 2.470000 |
| 2_bedrooms | 2.020000 |
| fullbase | 1.990000 |
| 2_garagepl | 1.890000 |
| airco | 1.810000 |
| 4_stories | 1.630000 |
| recroom | 1.510000 |
| 3_stories | 1.400000 |
| 3_garagepl | 1.290000 |
| 4_bathrms | 1.240000 |
| 6_bedrooms | 1.030000 |
Analysis - Delete the feature with the highest p-value (p > 0.05), i.e. '6_bedrooms', for a stable model
Model Attribute - Adj. R-squared:0.651
# MODEL 4: drop '6_bedrooms' (highest p-value, 0.491) and refit.
x_new.pop('6_bedrooms')
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 4
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.665
Model: OLS Adj. R-squared: 0.652
Method: Least Squares F-statistic: 51.97
Date: Thu, 21 Jan 2021 Prob (F-statistic): 4.26e-78
Time: 14:41:59 Log-Likelihood: 357.61
No. Observations: 382 AIC: -685.2
Df Residuals: 367 BIC: -626.0
Df Model: 14
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.2946 0.045 6.613 0.000 0.207 0.382
lotsize 0.2548 0.031 8.306 0.000 0.195 0.315
recroom 0.0428 0.014 2.989 0.003 0.015 0.071
fullbase 0.0430 0.012 3.586 0.000 0.019 0.067
airco 0.0705 0.012 5.937 0.000 0.047 0.094
2_bedrooms -0.0250 0.014 -1.833 0.068 -0.052 0.002
1_bathrms -0.1844 0.041 -4.540 0.000 -0.264 -0.105
2_bathrms -0.1003 0.041 -2.430 0.016 -0.181 -0.019
4_bathrms 0.3151 0.113 2.793 0.005 0.093 0.537
2_stories 0.0394 0.013 3.136 0.002 0.015 0.064
3_stories 0.1118 0.023 4.846 0.000 0.066 0.157
4_stories 0.1213 0.022 5.463 0.000 0.078 0.165
0_garagepl -0.0513 0.013 -4.068 0.000 -0.076 -0.027
2_garagepl 0.0319 0.015 2.081 0.038 0.002 0.062
3_garagepl -0.0537 0.042 -1.291 0.198 -0.135 0.028
==============================================================================
Omnibus: 72.219 Durbin-Watson: 2.061
Prob(Omnibus): 0.000 Jarque-Bera (JB): 187.354
Skew: 0.909 Prob(JB): 2.07e-41
Kurtosis: 5.909 Cond. No. 38.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| VIF | |
|---|---|
| Features | |
| 1_bathrms | 8.880000 |
| lotsize | 4.580000 |
| 2_bathrms | 4.100000 |
| 0_garagepl | 3.250000 |
| 2_stories | 2.460000 |
| 2_bedrooms | 2.020000 |
| fullbase | 1.980000 |
| 2_garagepl | 1.890000 |
| airco | 1.800000 |
| 4_stories | 1.630000 |
| recroom | 1.510000 |
| 3_stories | 1.400000 |
| 3_garagepl | 1.290000 |
| 4_bathrms | 1.240000 |
Analysis - Delete the feature with the highest p-value (p > 0.05), i.e. '3_garagepl', for a stable model
Model Attribute - Adj. R-squared:0.652
# MODEL 5: drop '3_garagepl' (highest p-value, 0.198) and refit.
x_new.pop('3_garagepl')
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 5
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.663
Model: OLS Adj. R-squared: 0.651
Method: Least Squares F-statistic: 55.74
Date: Thu, 21 Jan 2021 Prob (F-statistic): 1.27e-78
Time: 14:41:59 Log-Likelihood: 356.75
No. Observations: 382 AIC: -685.5
Df Residuals: 368 BIC: -630.3
Df Model: 13
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.2918 0.045 6.552 0.000 0.204 0.379
lotsize 0.2519 0.031 8.225 0.000 0.192 0.312
recroom 0.0437 0.014 3.057 0.002 0.016 0.072
fullbase 0.0424 0.012 3.538 0.000 0.019 0.066
airco 0.0703 0.012 5.916 0.000 0.047 0.094
2_bedrooms -0.0237 0.014 -1.735 0.084 -0.050 0.003
1_bathrms -0.1852 0.041 -4.557 0.000 -0.265 -0.105
2_bathrms -0.1005 0.041 -2.433 0.015 -0.182 -0.019
4_bathrms 0.2675 0.107 2.507 0.013 0.058 0.477
2_stories 0.0414 0.012 3.317 0.001 0.017 0.066
3_stories 0.1136 0.023 4.927 0.000 0.068 0.159
4_stories 0.1201 0.022 5.407 0.000 0.076 0.164
0_garagepl -0.0483 0.012 -3.894 0.000 -0.073 -0.024
2_garagepl 0.0358 0.015 2.379 0.018 0.006 0.065
==============================================================================
Omnibus: 71.506 Durbin-Watson: 2.062
Prob(Omnibus): 0.000 Jarque-Bera (JB): 184.814
Skew: 0.902 Prob(JB): 7.38e-41
Kurtosis: 5.891 Cond. No. 36.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| VIF | |
|---|---|
| Features | |
| 1_bathrms | 8.700000 |
| lotsize | 4.540000 |
| 2_bathrms | 4.060000 |
| 0_garagepl | 3.150000 |
| 2_stories | 2.430000 |
| 2_bedrooms | 2.010000 |
| fullbase | 1.980000 |
| 2_garagepl | 1.820000 |
| airco | 1.800000 |
| 4_stories | 1.620000 |
| recroom | 1.500000 |
| 3_stories | 1.400000 |
| 4_bathrms | 1.080000 |
Analysis - Delete the feature with the highest p-value (p > 0.05), i.e. '2_bedrooms', for a stable model
Model Attribute - Adj. R-squared:0.651
# MODEL 6: drop '2_bedrooms' (highest p-value, 0.084) and refit.
x_new.pop('2_bedrooms')
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 6
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.660
Model: OLS Adj. R-squared: 0.649
Method: Least Squares F-statistic: 59.81
Date: Thu, 21 Jan 2021 Prob (F-statistic): 7.00e-79
Time: 14:41:59 Log-Likelihood: 355.19
No. Observations: 382 AIC: -684.4
Df Residuals: 369 BIC: -633.1
Df Model: 12
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.2779 0.044 6.327 0.000 0.192 0.364
lotsize 0.2558 0.031 8.354 0.000 0.196 0.316
recroom 0.0443 0.014 3.086 0.002 0.016 0.072
fullbase 0.0447 0.012 3.740 0.000 0.021 0.068
airco 0.0720 0.012 6.067 0.000 0.049 0.095
1_bathrms -0.1882 0.041 -4.621 0.000 -0.268 -0.108
2_bathrms -0.1006 0.041 -2.431 0.016 -0.182 -0.019
4_bathrms 0.2676 0.107 2.500 0.013 0.057 0.478
2_stories 0.0518 0.011 4.705 0.000 0.030 0.073
3_stories 0.1254 0.022 5.678 0.000 0.082 0.169
4_stories 0.1297 0.022 6.013 0.000 0.087 0.172
0_garagepl -0.0476 0.012 -3.828 0.000 -0.072 -0.023
2_garagepl 0.0392 0.015 2.618 0.009 0.010 0.069
==============================================================================
Omnibus: 70.071 Durbin-Watson: 2.064
Prob(Omnibus): 0.000 Jarque-Bera (JB): 174.216
Skew: 0.898 Prob(JB): 1.48e-38
Kurtosis: 5.778 Cond. No. 35.8
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| VIF | |
|---|---|
| Features | |
| 1_bathrms | 6.560000 |
| lotsize | 4.540000 |
| 2_bathrms | 3.610000 |
| 0_garagepl | 3.150000 |
| fullbase | 1.970000 |
| 2_stories | 1.930000 |
| 2_garagepl | 1.800000 |
| airco | 1.790000 |
| 4_stories | 1.540000 |
| recroom | 1.500000 |
| 3_stories | 1.290000 |
| 4_bathrms | 1.080000 |
All p-values are less than 0.05
Deleting the features 1_bathrms where VIF > 5.
Adj. R-squared is 0.649
# MODEL 7: all p-values are now < 0.05; drop '1_bathrms' for its
# high multicollinearity (VIF 6.56 > 5) and refit.
x_new.pop('1_bathrms')
x_new,lm_new = statsmodel_summary(y_train,x_new)
MODEL - 7
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.641
Model: OLS Adj. R-squared: 0.630
Method: Least Squares F-statistic: 60.00
Date: Thu, 21 Jan 2021 Prob (F-statistic): 2.44e-75
Time: 14:41:59 Log-Likelihood: 344.45
No. Observations: 382 AIC: -664.9
Df Residuals: 370 BIC: -617.5
Df Model: 11
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0889 0.016 5.406 0.000 0.057 0.121
lotsize 0.2565 0.031 8.154 0.000 0.195 0.318
recroom 0.0420 0.015 2.851 0.005 0.013 0.071
fullbase 0.0504 0.012 4.126 0.000 0.026 0.074
airco 0.0721 0.012 5.911 0.000 0.048 0.096
2_bathrms 0.0809 0.013 6.032 0.000 0.055 0.107
4_bathrms 0.4460 0.103 4.351 0.000 0.244 0.648
2_stories 0.0561 0.011 4.988 0.000 0.034 0.078
3_stories 0.1287 0.023 5.679 0.000 0.084 0.173
4_stories 0.1398 0.022 6.343 0.000 0.096 0.183
0_garagepl -0.0481 0.013 -3.764 0.000 -0.073 -0.023
2_garagepl 0.0440 0.015 2.872 0.004 0.014 0.074
==============================================================================
Omnibus: 73.409 Durbin-Watson: 2.066
Prob(Omnibus): 0.000 Jarque-Bera (JB): 180.266
Skew: 0.944 Prob(JB): 7.17e-40
Kurtosis: 5.786 Cond. No. 28.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| VIF | |
|---|---|
| Features | |
| lotsize | 2.960000 |
| fullbase | 1.940000 |
| 0_garagepl | 1.890000 |
| airco | 1.790000 |
| 2_bathrms | 1.700000 |
| 2_stories | 1.690000 |
| 2_garagepl | 1.630000 |
| 4_stories | 1.530000 |
| recroom | 1.500000 |
| 3_stories | 1.280000 |
| 4_bathrms | 1.060000 |
All p-values are less than 0.05
All the VIF values are < 5.
Adj. R-squared is 0.630
# Preview the final training design matrix (first 5 rows).
display(x_new.head(5))
| const | lotsize | recroom | fullbase | airco | 2_bathrms | 4_bathrms | 2_stories | 3_stories | 4_stories | 0_garagepl | 2_garagepl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 46 | 1.0 | 0.120346 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 50 | 1.0 | 0.316017 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 483 | 1.0 | 0.203463 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 19 | 1.0 | 0.202251 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 159 | 1.0 | 0.200693 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
# Residual analysis: score the training set and plot the error distribution
# to check the normality-of-errors assumption.
y_train_price = lm_new.predict(x_new)   # fitted values on the training set
residuals = y_train - y_train_price     # error terms
fig = plt.figure(figsize=(9, 6))
sns.histplot(residuals, bins=20)
fig.suptitle('Error Terms', fontsize=20)  # plot heading
plt.xlabel('Errors', fontsize=18)         # x-axis label
plt.show()
Analysis: Errors seem to be normally distributed, which signifies that the model's predictions are not by chance.
# Prepare the hold-out test set: apply the scaler fitted on the TRAIN split
# (never re-fit on test), then separate the target from the predictors.
df_test.head()
df_test[rescale_cols] = scale.transform(df_test[rescale_cols])
display(df_test.head())
y_test = df_test.pop('price')  # target variable
x_test = df_test               # remaining columns are predictors
# Features retained by the final model, excluding the intercept term.
final_cols = [col for col in x_new.columns if col != 'const']
print(final_cols)
| price | lotsize | recroom | fullbase | airco | 1_bedrooms | 2_bedrooms | 3_bedrooms | 4_bedrooms | 5_bedrooms | ... | 3_bathrms | 4_bathrms | 1_stories | 2_stories | 3_stories | 4_stories | 0_garagepl | 1_garagepl | 2_garagepl | 3_garagepl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 54 | 0.078788 | 0.171429 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 506 | 0.303030 | 0.705628 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 496 | 0.212121 | 0.246753 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 267 | 0.166667 | 0.168831 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 153 | 0.103030 | 0.168831 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 23 columns
['lotsize', 'recroom', 'fullbase', 'airco', '2_bathrms', '4_bathrms', '2_stories', '3_stories', '4_stories', '0_garagepl', '2_garagepl']
# Restrict the test design matrix to the final model's features, add the
# intercept column, then score the model on unseen data.
x_test_new = sm.add_constant(x_test.filter(final_cols))
y_pred = lm_new.predict(x_test_new)
# R-squared on the hold-out set quantifies generalisation.
print("R-square calculated for test data is : " + str(r2_score(y_test, y_pred)))
R-square calculated for test data is : 0.5565009459158257
# Visual check of prediction quality: actual vs. predicted test prices.
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(y_test, y_pred)
fig.suptitle('Price [ Test Vs Predicted ]', fontsize=20)
ax.set_xlabel('Test', fontsize=18)
ax.set_ylabel('Predicted', fontsize=18)
plt.show()
# Correlation between actual and predicted prices on the test set.
test_train = pd.concat([y_test, y_pred], axis=1)
test_train.columns = ['Price', 'Price Predicted']
display(test_train.corr())
| Price | Price Predicted | |
|---|---|---|
| Price | 1.000000 | 0.746284 |
| Price Predicted | 0.746284 | 1.000000 |
Analysis: Predicted and actual prices are 75% positively correlated.
# Final model summary for reporting (same fitted model as MODEL - 7 above).
print(lm_new.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.641
Model: OLS Adj. R-squared: 0.630
Method: Least Squares F-statistic: 60.00
Date: Thu, 21 Jan 2021 Prob (F-statistic): 2.44e-75
Time: 14:41:59 Log-Likelihood: 344.45
No. Observations: 382 AIC: -664.9
Df Residuals: 370 BIC: -617.5
Df Model: 11
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0889 0.016 5.406 0.000 0.057 0.121
lotsize 0.2565 0.031 8.154 0.000 0.195 0.318
recroom 0.0420 0.015 2.851 0.005 0.013 0.071
fullbase 0.0504 0.012 4.126 0.000 0.026 0.074
airco 0.0721 0.012 5.911 0.000 0.048 0.096
2_bathrms 0.0809 0.013 6.032 0.000 0.055 0.107
4_bathrms 0.4460 0.103 4.351 0.000 0.244 0.648
2_stories 0.0561 0.011 4.988 0.000 0.034 0.078
3_stories 0.1287 0.023 5.679 0.000 0.084 0.173
4_stories 0.1398 0.022 6.343 0.000 0.096 0.183
0_garagepl -0.0481 0.013 -3.764 0.000 -0.073 -0.023
2_garagepl 0.0440 0.015 2.872 0.004 0.014 0.074
==============================================================================
Omnibus: 73.409 Durbin-Watson: 2.066
Prob(Omnibus): 0.000 Jarque-Bera (JB): 180.266
Skew: 0.944 Prob(JB): 7.17e-40
Kurtosis: 5.786 Cond. No. 28.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
R-squared and Adjusted R-squared (extent of fit) are 0.641 and 0.63; 63% of the variance is explained.
F-stats and Prob(F-stats) (overall model fit) - 60 and 2.44e-75, which is approx. 0.0 — the model fit is significant and explains 63% of the variance, which is not a random result.
p-values for the coefficients is less than the significance level of 5%, which means all the predictors are statistically significant.
Regression line: **price = 0.0889 + (0.2565 * lotsize) + (0.0420 * recroom) + (0.0504 * fullbase) + (0.0721 * airco) + (0.0809 * 2_bathrms) + (0.4460 * 4_bathrms) + (0.0561 * 2_stories) + (0.1287 * 3_stories) + (0.1398 * 4_stories) - (0.0481 * 0_garagepl) + (0.044 * 2_garagepl)**
As per final model attributes which are best suited for predicting **Price** are -
1. Lot Size
2. Recreation Room
3. Full Basement
4. Air Conditioned
5. 2 and 4 Bathroom
6. 2,3 and 4 stories
7. 0 and 2 Garage places
Please note that the **Price** decreases when there is **NO GARAGE PLACE**